1 Preparatory steps for the Markdown file

Setting the options for knitr

# Global knitr chunk options for this analysis document.
library(knitr)
knitr::opts_chunk$set(results = 'asis',
                       echo = TRUE,
                       comment = NA,
                       prompt  = FALSE,
                       warning = FALSE,
                       message = FALSE,
                       fig.align = "center",
                       fig.width = 8.125,
                       out.width = "100%",
                       fig.path = "Figures/figures_analysis_perfs/",
                       dev = c('png', 'tiff'),
                       dev.args = list(png = list(type = "cairo"), tiff = list(compression = 'lzw')),
                       # 'cache' was previously passed twice (FALSE then TRUE);
                       # it is now set once. Caching is enabled, consistent with
                       # cache.path and autodep being configured below.
                       cache   = TRUE,
                       # NOTE(review): absolute Windows path - not portable
                       # across machines; consider a relative path.
                       cache.path = "D:/cache/cache_analysis_perf/",
                       autodep = TRUE)
options(width = 1000, scipen = 999, knitr.kable.NA = '')

Loading libraries…

library(tidyverse) # The tidy verse
library(reshape2)
library(caret) # To ease classification training
library(gridExtra)
library(mlr)
library(rlist)
library(kableExtra)

Cleaning the memory…

# Clear the whole workspace (including hidden objects) so the analysis starts
# from a clean environment. NOTE(review): rm(list = ls()) in a script is
# generally discouraged; kept here as it is deliberate for this report.
rm(list = ls(all.names = TRUE))

We define a ‘not in’ operator…

# 'Not in' operator: TRUE for each element of x that is absent from table.
# Behaviorally equivalent to Negate("%in%").
'%!in%' <- function(x, table) {
  !(x %in% table)
}

2 Loading R objects previously defined

We load data tables and definitions of features sets prepared during the curation phase

# Load the objects prepared during the curation phase (in particular the data
# table `df` used below). Assumes the file exists in the working directory.
load(file = "Study_objects.RData")

3 Preparing objects

We define two vectors to rank individuals and call types by decreasing order of number of calls…

# Individuals ranked by decreasing number of calls.
decreasing_order_nb_calls_by_individual <- df %>%
  count(individual) %>%
  arrange(desc(n)) %>%
  pull(individual) %>%
  as.character()

# Call types ranked by decreasing number of calls.
decreasing_order_nb_calls_by_type <- df %>%
  count(type) %>%
  arrange(desc(n)) %>%
  pull(type) %>%
  as.character()

We also define tables which contain the number of calls per individual, and per call type…

# Number of calls per individual, sorted in decreasing order.
df_individual <- df %>%
  pull(individual) %>%
  table() %>%
  as_tibble() %>%
  arrange(desc(n)) %>%
  rename(Individual = 1)

# Number of calls per call type, sorted in decreasing order.
df_type <- df %>%
  pull(type) %>%
  table() %>%
  as_tibble() %>%
  arrange(desc(n)) %>%
  rename(Type = 1)

4 Defining generic functions for analysis

Since we have to analyze the performances of various classifiers i) for both individuals (i.e. individual signatures) and (call) types and ii) with different features sets, it makes sense to define a number of generic functions which we will call repeatedly later on.

4.1 Functions to load and prepare the data

We first define a function define_classifier_type_and_configuration() which takes a data table and adds two additional columns to it: one column to define the type of classifier (single classifier or ensemble classifier), and another column to define the configuration, which is the intersection of the classifier and the feature set…

# Add two factor columns to a results table:
#  - classifier_type: "ensemble" vs "single classifier", derived from `algo`
#  - configuration:   "<algo> - <set>", the classifier x feature-set pairing
define_classifier_type_and_configuration <- function(df) {
  df %>%
    mutate(
      classifier_type = factor(ifelse(algo == "ensemble", "ensemble", "single classifier")),
      configuration = factor(paste0(algo, " - ", set))
    )
}

We then define the function load_and_prepare_data(), which takes as parameters i) the name of a file containing classification results for a target variable and a management strategy for sequences of calls, and ii) a vector of strings of characters describing an order for the levels of the target variable. The function loads and further prepares the data…

# Load classification results and prepare the tables used throughout the
# analysis.
#
# Parameters:
#   filename       - path to an .RData file containing a list `all_outputs`
#                    with elements "performances", "confusion", "fimp" and
#                    "sets" for each iteration
#   level_ordering - character vector giving the desired ordering of the
#                    levels of the target variable (used to relevel the
#                    confusion tables for display)
#
# Returns a list with: raw and averaged performances, raw and averaged
# confusion tables, raw and averaged feature importance (averaged_fimp is
# NULL when feature importance was not computed) and the sequence scores.
load_and_prepare_data <- function(filename, level_ordering) {
  
  # Loading the data (brings `all_outputs` into the function environment)
  load(filename)
  
  # Extracting the various components from the list of results
  performances <- all_outputs %>% purrr::map_dfr("performances")
  confusion <- all_outputs %>% purrr::map_dfr("confusion")
  fimp <- all_outputs %>% purrr::map_dfr("fimp")
  sets <- all_outputs %>% purrr::map("sets")
  sequence_scores <- sets %>% purrr::map_dbl("sequence_score")

  # Renaming the metrics/measures for easier identification
  measure_names <- c("acc" = "accuracy", "auc"= "AUC", "bac" = "balanced accuracy", "kappa" = "Kappa", "logloss" = "log loss", "mmce" = "mean misclassification error")

  performances <- performances %>%
    mutate(measure_name = as.factor(measure_name), 
          measure_name = recode(measure_name, !!!measure_names))

  # Adding the factors classifier_type and configuration
  performances <- performances %>% define_classifier_type_and_configuration()
  confusion <- confusion %>% define_classifier_type_and_configuration()
  
  if (nrow(fimp) > 0) # Only if feature importance was computed
    fimp <- fimp %>% define_classifier_type_and_configuration()

  # Computing average values across iterations. `.groups = "drop"` already
  # returns ungrouped tables, so the previously redundant ungroup() calls
  # have been removed.
  averaged_performances <- performances %>%
    group_by(algo, set, measure_name, configuration, classifier_type) %>%
    summarize(mean = mean(measure), sd = sd(measure), .groups = "drop")
  
  averaged_confusion <- confusion %>%
    group_by(algo, set, prediction, reference) %>%
    summarize(n = mean(n), .groups = "drop")

  averaged_fimp <- NULL

  if (nrow(fimp) > 0)
    averaged_fimp <- fimp %>%
      group_by(algo, set, feature) %>%
      summarize(mean = mean(imp), sd = sd(imp), .groups = "drop")

  # Relevel prediction and reference so confusion matrices display in the
  # requested order (e.g. by decreasing number of calls)
  confusion <- confusion %>% mutate(prediction = factor(prediction, levels = level_ordering),
                                    reference = factor(reference, levels = level_ordering))

  averaged_confusion <- averaged_confusion %>% mutate(prediction = factor(prediction, levels = level_ordering),
                                                      reference = factor(reference, levels = level_ordering))

  # We return a list of objects
  
  return (list(performances = performances,
               averaged_performances = averaged_performances,
               confusion = confusion,
               averaged_confusion = averaged_confusion,
               fimp = fimp,
               averaged_fimp = averaged_fimp,
               sequence_scores = sequence_scores))
}

4.2 Functions to create histograms of metric values

The function get_histograms_9_configurations() creates a faceted figure which contains 9 histograms - 3 classifiers (SVM, NN, xgboost) x 3 feature sets (Bioacoustic, DCT, MFCC) - summarizing the classification performances as defined by a target metric/measure…

# Faceted figure of 9 histograms (3 single classifiers x 3 feature sets)
# showing the distribution of `target_measure` across repetitions.
get_histograms_9_configurations <- function(performances_table, target_measure) {

  plot_data <- performances_table %>%
    filter(algo != "ensemble", measure_name == target_measure)

  p <- ggplot(plot_data, aes(x = measure)) +
    geom_histogram(bins = 20) +
    geom_density() +
    facet_grid(algo ~ set) +
    xlab(target_measure) +
    labs(title = paste0("Histograms of ", target_measure, " values"),
         subtitle = "Different classifiers and feature sets")

  return (p)
}

4.3 Functions to create barplots of performances

The function get_barplots_comparison_9_configurations() creates a barplot which displays the average classification performances - as assessed by a target measure/metric - for 9 configurations - 3 classifiers (SVM, NN, xgboost) x 3 feature sets (Bioacoustic, DCT, MFCC)…

# Barplot of the average value of `target_measure` (+/- 1 sd error bars) for
# the 9 single-classifier configurations (3 classifiers x 3 feature sets).
get_barplots_comparison_9_configurations <- function(performances_table, target_measure) {

  title <- paste0("Performances with the ", target_measure, " metric")

  plot_data <- performances_table %>%
    filter(algo != "ensemble", measure_name == target_measure)

  p <- ggplot(plot_data, aes(x = set, y = mean, fill = algo)) +
    geom_bar(stat = "identity", col = "black", position = position_dodge()) +
    geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd),
                  width = .2, position = position_dodge(0.9)) +
    geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white",
              size = 3.5, position = position_dodge(width = 0.9)) +
    labs(title = title,
         subtitle = "Different classifiers and feature sets",
         fill = "Classifier") +
    theme_minimal() + coord_flip() +
    ylab(target_measure) + xlab("Feature set")

  return (p)
}

The function get_barplots_comparison_all_configurations_by_classifier_type() creates a barplot which displays the average classification performances - as assessed by a target measure/metric - for all configurations present in the provided table, with a 2-color scheme corresponding to whether a classifier is an ensemble one or not…

# Barplot of the average value of `target_measure` for every configuration in
# `performances_table` (single classifiers and ensembles), ordered by mean
# value and colored by classifier type (single vs ensemble).
get_barplots_comparison_all_configurations_by_classifier_type <- function(performances_table, target_measure) {

  title <- paste0("Performances with the ", target_measure, " metric")

  p <- performances_table %>%
    filter(measure_name == target_measure) %>%
    ggplot(aes(x = reorder(configuration, mean), y = mean, fill = classifier_type)) +
    geom_bar(stat = "identity", col = "black", position = position_dodge()) +
    geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
    geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
    # labs() sets the title; the previously redundant trailing ggtitle(title)
    # call (which re-set the exact same title) has been removed
    labs(title = title, subtitle = "Different configurations", fill = "Classifier type") +
    theme_minimal() + coord_flip() + ylab(target_measure) + xlab("Configuration")
  
  return (p)
}

The function get_barplots_comparison_all_configurations_by_set() creates a barplot which displays the average classification performances - as assessed by a target measure/metric - for all configurations present in the provided table, with a color scheme representing the different feature sets…

# Barplot of the average value of `target_measure` for every configuration in
# `performances_table`, ordered by mean value and colored by feature set.
get_barplots_comparison_all_configurations_by_set <- function(performances_table, target_measure) {

  title <- paste0("Performances with the ", target_measure, " metric")

  p <- performances_table %>%
    filter(measure_name == target_measure) %>%
    ggplot(aes(x = reorder(configuration, mean), y = mean, fill = set)) +
    geom_bar(stat = "identity", col = "black", position = position_dodge()) +
    geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
    # size = 3.5 added for consistency with the sibling barplot functions
    geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
    # labs() sets the title; the previously redundant trailing ggtitle(title)
    # call has been removed
    labs(title = title, subtitle = "Different configurations", fill = "Feature set") +
    theme_minimal() + coord_flip() + ylab(target_measure) + xlab("Configuration")
  
  return (p)
}

4.4 Functions to display confusion matrices

The function get_confusion_matrix() creates a well-formatted figure to display the confusion matrix for a specific classifier (algo) and a specific feature set. One can choose whether to display raw figures in addition to row-wise percentages in the cells of the matrix…

# Well-formatted confusion matrix for a given classifier (`target_algo`) and
# feature set (`target_set`). Tile colors encode row-wise percentages; cells
# can optionally also display the raw (averaged) counts.
#
# Parameters:
#   confusion_table     - table with columns algo, set, prediction, reference, n
#   target_algo         - classifier to display
#   target_set          - feature set to display
#   display_raw_numbers - if TRUE, show counts in addition to percentages
#   title               - optional custom title ("" selects a default title)
get_confusion_matrix <- function(confusion_table, target_algo, target_set, display_raw_numbers = TRUE, title = "") {

  if (title == "")
    title <- paste0("Confusion matrix - ", target_algo, " and ", target_set)
  
  confusion_table <- confusion_table %>%
    filter(algo == target_algo, set == target_set) %>%
    select(-algo, -set)
  
  # Total (averaged) number of calls per reference class, used to turn raw
  # counts into row-wise percentages
  cf_t <- confusion_table %>%
    group_by(reference) %>% summarize(n_ref = sum(n)) %>%
    ungroup()
  
  confusion_table <- confusion_table %>%
    inner_join(cf_t, by = "reference") %>%
    mutate(percent = 100 * n / n_ref)
  
  if (display_raw_numbers)
    confusion_table <- confusion_table %>% mutate(legend = paste0(round(n, 1), " (", round(percent, 1), " %)"))
  else
    confusion_table <- confusion_table %>% mutate(legend = paste0(round(percent, 1), " %"))

  p <- confusion_table %>%
    ggplot(aes(x = prediction, y = reference, fill = percent)) + 
    geom_tile(show.legend = FALSE) +
    geom_text(aes(label = legend), size = 3.5) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_fill_distiller(palette = "YlOrBr", direction = 1) +
    ggtitle(title) 
  
  return (p)
}

The function get_confusion_matrix_9_configurations() creates a faceted figure to display 9 confusion matrices corresponding to 9 different configurations - 3 classifiers (SVM, NN, xgboost) x 3 feature sets (Bioacoustic, DCT, MFCC). One can choose whether to display raw figures in addition to row-wise percentages in the cells of the matrices…

# Faceted figure of 9 confusion matrices, one per single-classifier
# configuration (3 classifiers x 3 feature sets). Tile colors encode the
# averaged number of calls; individual cell labels are omitted for legibility.
get_confusion_matrix_9_configurations <- function(confusion_table) {

  single_classifiers <- confusion_table %>%
    filter(algo != "ensemble")

  p <- ggplot(single_classifiers, aes(x = prediction, y = reference, fill = n)) +
    geom_tile(show.legend = FALSE) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    facet_grid(set ~ algo) +
    scale_fill_distiller(palette = "YlOrBr", direction = 1) +
    ggtitle("Confusion matrices for the different configurations")

  return (p)
}

4.5 Functions to create barplots for feature importance

The function get_fimp_barplot() creates a barplot to display the importance of the different features of a given feature set, for a given classifier…

# Barplot of feature importance (mean +/- 1 sd across repetitions) for a
# given classifier (`target_algo`) and feature set (`target_set`), with
# features ordered by mean importance.
get_fimp_barplot <- function(fimp_table, target_algo, target_set) {

  title <- paste0(target_set, " - ", target_algo)

  plot_data <- fimp_table %>%
    filter(algo == target_algo, set == target_set) %>%
    droplevels()

  p <- ggplot(plot_data, aes(x = reorder(feature, mean), y = mean)) +
    geom_bar(stat = "identity", col = "black") +
    geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
    xlab("Feature") + ylab("Importance") +
    theme_minimal() + coord_flip() +
    ggtitle(title)

  return (p)
}

5 Analyzing the performances for individual signatures

We first load the results of the assessments of the different models, and create different data tables…

# Load the individual-signature results (default, non-shuffled runs)
outputs_individual <- load_and_prepare_data("Result files/results - individual - default.RData", decreasing_order_nb_calls_by_individual)

5.1 Analyzing the performances of the three basic models

5.1.1 Looking at the distribution of values for the performances

We have conducted 100 repetitions of the same sequence: i) create a training-test partition randomly, ii) train a model with the training set, iii) assess the model with the test set. We can look at the distribution of values of the different metrics across the repetitions.

For logloss:

# Distribution of log loss values across the 100 repetitions
outputs_individual$performances %>% get_histograms_9_configurations("log loss")

While the histograms and density curves look sometimes not very regular, one can imagine that more repetitions would gradually lead to a smooth and symmetrical distribution, possibly a Gaussian one. The range of variation of the logloss values is quite significant in each configuration, which suggests it was a good idea to consider a number of repetitions.

For multi-class AUC, balanced accuracy and accuracy:

# Same histograms for the three other metrics
outputs_individual$performances %>% get_histograms_9_configurations("AUC")

outputs_individual$performances %>% get_histograms_9_configurations("balanced accuracy")

outputs_individual$performances %>% get_histograms_9_configurations("accuracy")

We reach the same conclusions as previously with log loss.

5.1.2 Assessing the performances with different metrics

We first focus on log loss, which is the metric which was used for the tuning of the hyper-parameters of our different classifiers. We display the average performances for each classifier and each set of predictors:

# Average log loss per classifier and feature set (9 configurations)
outputs_individual$averaged_performances %>% get_barplots_comparison_9_configurations("log loss")

We can make a number of observations:

  • One can clearly see that the set of predictors can be ordered from worse to best in terms of performances, with DCT coefficients offering the worst performances and MFCC the best ones. This is true regardless of the classifying algorithm
  • xgboost offer the best performances, while nn and svm are very close. This is clear for Bioacoustic parameters and MFCC, but a bit less for DCT coefficients, since nn is then on par with xgboost, while svm is a bit worse.
  • Consequently, the best performances are obtained with xgboost and MFCC. They are 5.5% better than those of the second best option - nn with MFCC.

We can confirm these results with two other metrics: multi-class AUC and balanced accuracy

# Side-by-side comparison of the 9 configurations with AUC and balanced accuracy
p1 <- outputs_individual$averaged_performances %>% get_barplots_comparison_9_configurations("AUC")
p2 <- outputs_individual$averaged_performances %>% get_barplots_comparison_9_configurations("balanced accuracy")
grid.arrange(p1, p2, ncol = 2)

While AUC gives nearly the same ranking between the 9 conditions as log loss, there are some differences with balanced accuracy. We can, however, still conclude that whatever the measure, xgboost and MFCC offer the best performances, especially when considered together. DCT parameters, on the contrary, lead to the worst performances.

We do not observe any specific pattern for the standard deviations.

5.2 Confusion matrices for the three basic models

We can display as a reminder the individuals ranked by the number of calls for each of them:

# Table of individuals with color-coded call counts (kableExtra formatting)
df_individual %>% mutate(n = text_spec(n, color = "white", align = "r", background = spec_color(n, option = "D", scale_from = c(0, 400)))) %>%
  kbl(digits = 3, escape = F) %>%
  kable_classic() %>%
  kable_styling(full_width = F)
Individual n
Jill 362
Zuani 257
Zamba 193
Vifijo 160
Djanoa 145
Hortense 111
Kumbuka 109
Bolombo 76
Lina 76
Busira 71

We can display the confusion matrix for our best case scenario, when xgboost is considered with MFCC:

# Confusion matrix for the best single configuration (xgboost + MFCC)
outputs_individual$averaged_confusion %>% get_confusion_matrix("xgboost", "MFCC", display_raw_numbers = F)

We observe that the more calls for an individual, the better it tends to be correctly recognized (although the trend is not linear), despite accounting for class imbalance with weights. ‘Heavier’ individuals (in terms of number of calls) also tend to attract classification outputs, although it is really clearly the case only for Jill.

If one pays attention to the individuals with the smallest number of calls, Bolombo is the most poorly classified, while the situation is a bit better for Busira, Kumbuka and especially Lina.

We can also display confusion matrices for our 9 configurations:

# Confusion matrices for all 9 single-classifier configurations
outputs_individual$averaged_confusion %>% get_confusion_matrix_9_configurations()

The patterns of correct and incorrect classifications seem to be the same in all 9 configurations. There does not seem to be a differential pattern in terms of improvement for individuals with more or less calls. To further investigate, we can draw side by side the two confusion matrices for the best (xgboost - MFCC) and worst (svm - DCT) configurations:

# Best (xgboost - MFCC) vs worst (svm - DCT) configurations side by side
p_xgboost_MFCC <- outputs_individual$averaged_confusion %>% get_confusion_matrix("xgboost", "MFCC", display_raw_numbers = F)
p_svm_DCT <- outputs_individual$averaged_confusion %>% get_confusion_matrix("svm", "DCT", display_raw_numbers = F)

grid.arrange(p_xgboost_MFCC, p_svm_DCT, ncol = 2)

We can see that all the individuals are better predicted with the better configuration (when we could have observed small violations to this observation for specific individuals). The better system therefore seems to be uniformly better.

5.3 Investigating feature importance

For the Bioacoustic and DCT feature sets, we can look at the importance of the various features for different classifying algorithms. We leave MFCC aside as there are too many features which are not really meaningful when considered in isolation.

We first look at the DCT set:

# Feature importance for the DCT set, one panel per classifier
p1 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("svm", "DCT")
p2 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("xgboost", "DCT")
p3 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("nn", "DCT")
grid.arrange(p1, p2, p3, ncol = 3)

The 3 algorithms all suggest that dct0 is the feature playing the most important role. dct1, dct2, dct4 and duration come next, with different rankings depending on the classifying algorithm and quite close to each other altogether. vocalization.HNR and dct3 are a bit weaker in all three approaches.

We then look at the Bioacoustic set:

# Feature importance for the Bioacoustic set, one panel per classifier
p1 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("svm", "Bioacoustic")
p2 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("xgboost", "Bioacoustic")
p3 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("nn", "Bioacoustic")
grid.arrange(p1, p2, p3, ncol = 3)

We observe that duration is consistently the most important feature for prediction of the individual. vocalization.HNR also plays an important role, and other parameters like q1t also seems to be a bit more important than the others, although it is not very clear.

It is interesting to note that while duration is the most important feature for DCT, it is not for Bioacoustic.

# xgboost feature importance: Bioacoustic vs DCT side by side
p_a <- outputs_individual$averaged_fimp %>% get_fimp_barplot("xgboost", "Bioacoustic")
p_b <- outputs_individual$averaged_fimp %>% get_fimp_barplot("xgboost", "DCT")
grid.arrange(p_a, p_b, ncol = 2)

5.4 Analyzing the performances of the ensemble models

5.4.1 Assessing the performances with different metrics

We can look at the performances of our different ensemble models, and compare them to those of our initial / primary classifiers. We first focus on log loss, since it is the metric which was used during the tuning of the hyper-parameters of the different classifiers:

# All configurations (single classifiers and ensembles), log loss metric
outputs_individual$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("log loss")

Several points can be highlighted:

  • ensembles / stacked learners tend to perform better than single classifiers
  • When it comes to combining the different primary classifiers for a given set of predictors, we observe first that the performances of the stacked learners are always better, with a larger improvement for MFCC coefficients than for Bioacoustic parameters and DCT coefficients.
  • When we assemble the various versions of a classifier for different sets of predictors, we very strongly improve the performances. This improvement is stronger than what we observe when we assemble across the classifiers.
  • Finally, when we assemble all 9 primary learners (3 classifiers x 3 sets), we obtain the best performances.

We can compute and display the increase in performance with respect to the worst configuration (svm and DCT):

# Improvement in log loss (in %) of each configuration relative to the worst
# one (the configuration with the highest mean log loss, i.e. svm - DCT).
outputs_individual$averaged_performances %>% filter(measure_name == "log loss") %>%
  mutate(improvement_percentage = round(100 * (max(mean) - mean) / max(mean), 2)) %>%
  ggplot(aes(x = reorder(configuration, improvement_percentage), y = improvement_percentage, fill = classifier_type)) +
  geom_bar(stat = "identity", col = "black", position = position_dodge()) +
  geom_text(aes(label = round(improvement_percentage, 1)), hjust = 1.2, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
  labs(fill = "Classifier type") +
  theme_minimal() + coord_flip() +
  # Fixed axis labels: y shows the improvement percentage (previously labeled
  # "log loss") and x lists configurations (previously labeled "Feature set")
  ylab("Improvement in log loss (%)") + xlab("Configuration") +
  ggtitle("Comparison of the performances with the logloss metric")

The increase in performance is very clear between the worst and best configurations - 23.0%.

We can further look at other metrics:

# Same comparison of all configurations with AUC and balanced accuracy
outputs_individual$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("AUC")

outputs_individual$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("balanced accuracy")

The AUC values do not reveal the improvement in performances as clearly as other metrics, since the values are all very high - in other words, there is little ‘differentiation power’. However, there is no result contradicting what was observed with logloss. The same is observed with balanced accuracy.

5.4.2 Confusion matrix of the best stacked model

We can inspect the confusion matrix of the best stacked model:

# Confusion matrix of the full ensemble (all 9 primary learners combined)
p_best <- outputs_individual$averaged_confusion %>% get_confusion_matrix("ensemble", "3 classifiers x 3 sets", display_raw_numbers = F, title = "Confusion matrix - Ensemble 3 classifiers x 3 sets")
p_best

We can display the previous matrix side by side with two other confusion matrices shown previously to better understand how the performances are improved:

# Best ensemble vs best and worst single configurations, side by side
grid.arrange(p_best, p_xgboost_MFCC, p_svm_DCT, ncol = 2, nrow = 2)

We can see that the performances with the best stacked model are improved for all the individuals, without any obvious differential effect.

5.5 Comparing the performances to a random baseline

We first load the results of the 1,000 repetitions with shuffling / random permutation, and create different data tables…

# Load the shuffled (random-permutation) baseline results
outputs_individual_shuffled <- load_and_prepare_data("Result files/results - individual - shuffled.RData", decreasing_order_nb_calls_by_individual)

We draw the histogram of the values of a metric with the permutation approach, as well as a vertical line representing the mean value for the initial, non-permuted, approach.

For logloss:

# Mean log loss (without shuffling) per configuration: used for the red
# reference lines
perfs_mean_tmp <- outputs_individual$averaged_performances %>%
  filter(algo != "ensemble", measure_name == "log loss") %>%
  select(algo, set, mean)

# Raw (non-shuffled) log loss values, overlaid in pink
perfs_tmp <- outputs_individual$performances %>% filter(algo != "ensemble", measure_name == "log loss")

outputs_individual_shuffled$performances %>% 
  filter(measure_name == "log loss") %>%
  ggplot(aes(x = measure)) + 
  geom_histogram(bins = 80) +
  # Full argument name 'scales' (was 'scale', which relied on partial matching)
  facet_grid(algo ~ set, scales = "free") +
  xlab("log loss") +
  labs(title = "Histograms of the distribution of log loss values", subtitle = "Different classifiers and feature sets") +
  geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
  geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)

The histograms do not overlap at all, and are very distant from each other (and the red line representing the mean of the performances without shuffling is very clearly left of the ‘shuffled’ histogram). This means that no logloss value obtained with a random permutation is ever lower than the value obtained with our data. We can therefore reject the null hypothesis of our prediction not being different from chance prediction with a p_value lower than 0.001.

For AUC:

# Mean AUC (without shuffling) per configuration: used for the red reference lines
perfs_mean_tmp <- outputs_individual$averaged_performances %>%
  filter(algo != "ensemble", measure_name == "AUC") %>%
  select(algo, set, mean)

# Raw (non-shuffled) AUC values, overlaid in pink
perfs_tmp <- outputs_individual$performances %>% filter(algo != "ensemble", measure_name == "AUC")

outputs_individual_shuffled$performances %>%
  filter(measure_name == "AUC") %>%
  ggplot(aes(x = measure)) + 
  geom_histogram(bins = 80) +
  # Full argument name 'scales' (was 'scale', which relied on partial matching)
  facet_grid(algo ~ set, scales = "free") +
  xlab("AUC") +
  labs(title = "Histograms of the distribution of AUC values", subtitle = "Different classifiers and feature sets") +
  geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
  geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)

The situation is symmetrical to the previous one, which makes sense since better performances mean lower logloss but higher AUC. No AUC value obtained with a random permutation is ever higher than the values obtained with our data. We can therefore reject the null hypothesis of our prediction not being different from chance prediction with a p_value lower than 0.001.

We reach the same conclusions with the balanced accuracy metric:

# Mean balanced accuracy (without shuffling) per configuration: used for the
# red reference lines
perfs_mean_tmp <- outputs_individual$averaged_performances %>%
  filter(algo != "ensemble", measure_name == "balanced accuracy") %>%
  select(algo, set, mean)

# Raw (non-shuffled) balanced accuracy values, overlaid in pink
perfs_tmp <- outputs_individual$performances %>% filter(algo != "ensemble", measure_name == "balanced accuracy")

outputs_individual_shuffled$performances %>%
  filter(measure_name == "balanced accuracy") %>%
  ggplot(aes(x = measure)) + 
  geom_histogram(bins = 80) +
  # Full argument name 'scales' (was 'scale', which relied on partial matching)
  facet_grid(algo ~ set, scales = "free") +
  # Fixed x-axis label: this plot shows balanced accuracy (was mislabeled "AUC")
  xlab("balanced accuracy") +
  labs(title = "Histograms of the distribution of balanced accuracy values", subtitle = "Different classifiers and feature sets") +
  geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
  geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)

We can therefore very confidently conclude that our primary classifiers, svm, nn and xgboost all predict well above chance.

5.6 Displaying and saving a table of results for the publication

In order to prepare a table of results for the publication, we need to import earlier data obtained with the pDFA…

# Ordering of algorithms and feature sets for the final results table, plus
# the configuration labels used in the publication
algo_levels <- c("pDFA", "svm", "nn", "xgboost", "ensemble")
set_levels <- c("Bioacoustic", "DCT", "MFCC",
                "3 sets, svm", "3 sets, nn", "3 sets, xgboost",
                "3 classifiers, Bioacoustic set", "3 classifiers, DCT set", "3 classifiers, MFCC set", "3 classifiers x 3 sets")
configs <- c(rep("", 2), paste0("(", 1:9, ")"), rep("", 7))
combined_configs <- c(rep("", 11), "(1+2+3)", "(4+5+6)", "(7+8+9)", "(1+4+7)", "(2+5+8)", "(3+6+9)", "(1+2+3+4+5+6+7+8+9)")

# pDFA results computed earlier, for the two feature sets it was run on
res_pDFA <- readRDS("Result files/pdfa-permuted-resultats01.rds")

pdFA_Bio_logloss <- res_pDFA$individual$Bioacoustic %>% pull(Log.Loss)
pdFA_Bio_auc <- res_pDFA$individual$Bioacoustic %>% pull(AUC)
pdFA_Bio_bac <- res_pDFA$individual$Bioacoustic %>% pull(Bal.Acc)
pdFA_Bio_acc <- res_pDFA$individual$Bioacoustic %>% pull(N.Correct.Test) / res_pDFA$individual$Bioacoustic %>% pull(N.Test)
pdFA_DCT_logloss <- res_pDFA$individual$DCT %>% pull(Log.Loss)
pdFA_DCT_auc <- res_pDFA$individual$DCT %>% pull(AUC)
pdFA_DCT_bac <- res_pDFA$individual$DCT %>% pull(Bal.Acc)
# Fixed copy-paste bug: the denominator previously used the Bioacoustic
# N.Test instead of the DCT one
pdFA_DCT_acc <- res_pDFA$individual$DCT %>% pull(N.Correct.Test) / res_pDFA$individual$DCT %>% pull(N.Test)

We create and display a comprehensive table of results…

# Build the final results table: one row per configuration (wide format),
# with the pDFA results added, ordered for the publication and rounded
output_df <- outputs_individual$averaged_performances %>%
  select(-sd, -classifier_type, -configuration) %>%
  pivot_wider(names_from = measure_name, values_from = mean) %>%
  select(-Kappa, -`mean misclassification error`) %>%
  add_row(algo = "pDFA", set = "Bioacoustic", accuracy = pdFA_Bio_acc, AUC = pdFA_Bio_auc, `balanced accuracy` = pdFA_Bio_bac, `log loss` = pdFA_Bio_logloss) %>%
  add_row(algo = "pDFA", set = "DCT", accuracy = pdFA_DCT_acc, AUC = pdFA_DCT_auc, `balanced accuracy` = pdFA_DCT_bac, `log loss` = pdFA_DCT_logloss) %>%
  mutate(algo = factor(algo, algo_levels), set = factor(set, set_levels)) %>%
  arrange(algo, set) %>%
  mutate(`Config. #`= configs, `Combined config.` = combined_configs) %>%
  select(algo, set, `Config. #`,  `Combined config.`, `log loss`, AUC, accuracy, `balanced accuracy`) %>%
  # across() replaces the superseded mutate_if()
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  rename(Algorithm = "algo", `Feature set` = "set")

# Display the formatted table (TRUE spelled out instead of T)
output_df %>% kable(align = 'llccrrrrr') %>% kable_classic() %>% kable_styling(full_width = TRUE, bootstrap_options = c("striped"))
Algorithm Feature set Config. # Combined config. log loss AUC accuracy balanced accuracy
pDFA Bioacoustic 1.745 0.731 0.410 0.236
pDFA DCT 1.698 0.731 0.429 0.235
svm Bioacoustic
1.490 0.840 0.516 0.392
svm DCT
1.567 0.822 0.486 0.364
svm MFCC
1.416 0.855 0.548 0.447
nn Bioacoustic
1.487 0.839 0.508 0.381
nn DCT
1.537 0.829 0.490 0.370
nn MFCC
1.415 0.855 0.536 0.414
xgboost Bioacoustic
1.415 0.850 0.544 0.447
xgboost DCT
1.530 0.827 0.493 0.401
xgboost MFCC
1.366 0.861 0.552 0.449
ensemble 3 sets, svm (1+2+3) 1.240 0.887 0.594 0.495
ensemble 3 sets, nn (4+5+6) 1.264 0.886 0.583 0.466
ensemble 3 sets, xgboost (7+8+9) 1.254 0.885 0.589 0.484
ensemble 3 classifiers, Bioacoustic set (1+4+7) 1.420 0.851 0.542 0.427
ensemble 3 classifiers, DCT set (2+5+8) 1.493 0.835 0.500 0.389
ensemble 3 classifiers, MFCC set (3+6+9) 1.311 0.873 0.571 0.465
ensemble 3 classifiers x 3 sets (1+2+3+4+5+6+7+8+9) 1.210 0.894 0.605 0.507

We further save the table of results as a text file…

# Save the results table as a tab-separated text file
write.table(output_df, file = "Result files/performances - individual.txt", sep = "\t", dec = ".", row.names = F, quote = F)

5.7 Preparing a figure for the publication

We create a new barplot which contains additional bar for the pDFA. The metric is balanced accuracy…

# Import the pDFA results and extract the mean and standard deviation of the
# balanced accuracy for the two feature sets (individual classification task).
res_pDFA <- readRDS("Result files/pdfa-permuted-resultats01.rds")
pdFA_Bio_bac_m <- res_pDFA$individual$Bioacoustic[["Bal.Acc"]]
pdFA_Bio_bac_sd <- res_pDFA$individual$Bioacoustic[["sd.Bal.Acc"]]
pdFA_DCT_bac_m <- res_pDFA$individual$DCT[["Bal.Acc"]]
pdFA_DCT_bac_sd <- res_pDFA$individual$DCT[["sd.Bal.Acc"]]

# Publication figure (individual task): append the two pDFA configurations as
# extra rows to the averaged balanced-accuracy results, keep only the single
# classifiers plus the full 3x3 ensemble, and draw the comparison barplot.
outputs_individual$averaged_performances %>%
  add_row(algo = "pDFA", set = "Bioacoustic", measure_name = "balanced accuracy", configuration = "pDFA - Bioacoustic", classifier_type = "single classifier", mean = pdFA_Bio_bac_m, sd = pdFA_Bio_bac_sd) %>%
  add_row(algo = "pDFA", set = "DCT", measure_name = "balanced accuracy", configuration = "pDFA - DCT", classifier_type = "single classifier", mean = pdFA_DCT_bac_m, sd = pdFA_DCT_bac_sd) %>%
  filter(algo != "ensemble" | configuration == "ensemble - 3 classifiers x 3 sets") %>%
  get_barplots_comparison_all_configurations_by_set("balanced accuracy")

6 Analyzing the performances for call types

We first load the results of the assessments of the different models, and create different data tables…

# Load the raw and averaged assessment results for the call-type classification task.
outputs_type <- load_and_prepare_data("Result files/results - type - default.RData", decreasing_order_nb_calls_by_type)

6.1 Analyzing the performances of the three basic models

6.1.1 Looking at the distribution of values for the performances

We have conducted 100 repetitions of the same sequence: i) create a training-test partition randomly, ii) train a model with the training set, iii) assess the model with the test set. We can look at the distribution of values of the different metrics across the repetitions.

For logloss:

# Distribution of log loss across the 100 repetitions, per classifier x feature set.
outputs_type$performances %>% get_histograms_9_configurations("log loss")

The distributions are similar in shape to those observed with individual. More repetitions would likely lead to smooth and symmetrical distributions, possibly Gaussian. Once again, the range of variation of the logloss values is quite significant in each configuration, which suggests it was a good idea to consider a number of repetitions.

We reach the same conclusions with multi-class AUC, balanced accuracy and accuracy:

# Same distributions for multi-class AUC.
outputs_type$performances %>% get_histograms_9_configurations("AUC")

# Same distributions for balanced accuracy.
outputs_type$performances %>% get_histograms_9_configurations("balanced accuracy")

# Same distributions for plain accuracy.
outputs_type$performances %>% get_histograms_9_configurations("accuracy")

6.1.2 Assessing the performances with different metrics

We first focus on logloss, which is the metric which was used for the tuning of the hyper-parameters of our different classifiers. We display the average performances for each classifier and each set of predictors:

# Mean log loss (the tuning metric) for each of the 9 classifier x feature-set configurations.
outputs_type$averaged_performances %>% get_barplots_comparison_9_configurations("log loss")

We can make a number of observations which differ from those we could make with individual:

  • One can clearly see that MFCC offer worse performances than DCT and Bioacoustic, which are quite close. This is true regardless of the classifying algorithm
  • xgboost offer the best performances for Bioacoustic and DCT, but not for MFCC. nn perform better than svm, although this is marginal for the Bioacoustic set
  • The best performances are obtained with xgboost and Bioacoustic. They are 4.4% better than those of the second best option - xgboost with DCT.

We can confirm these results with two other metrics: multi-class AUC and balanced accuracy

# Side-by-side barplots of mean AUC and mean balanced accuracy
# for the 9 classifier x feature-set configurations.
plot_auc <- outputs_type$averaged_performances %>% get_barplots_comparison_9_configurations("AUC")
plot_bac <- outputs_type$averaged_performances %>% get_barplots_comparison_9_configurations("balanced accuracy")
grid.arrange(plot_auc, plot_bac, ncol = 2)

While AUC gives nearly the same ranking between the 9 conditions as log loss, there are more differences with balanced accuracy. We can, however, conclude that whatever the metric, xgboost combined with Bioacoustic offers the best performances. MFCC parameters, on the contrary, lead consistently to the worst performances.

We do not observe any specific pattern for the standard deviations.

6.2 Confusion matrices of the three basic models

# Colour-code the per-type call counts (viridis scale over 0-400) and render
# the table; use TRUE/FALSE rather than the reassignable shortcuts T/F.
df_type %>% mutate(n = text_spec(n, color = "white", align = "r", background = spec_color(n, option = "D", scale_from = c(0, 400)))) %>%
  kbl(digits = 3, escape = FALSE) %>%
  kable_classic() %>%
  kable_styling(full_width = FALSE)
Type n
PY 443
SB 382
B 274
P 255
SCB 206

We can display the confusion matrix for our best case scenario, when xgboost is considered with Bioacoustic:

# Averaged confusion matrix of the best single configuration (xgboost + Bioacoustic).
outputs_type$averaged_confusion %>% get_confusion_matrix("xgboost", "Bioacoustic", display_raw_numbers = T)

The performances are clearly better than with individuals. P is classified extremely well (93% accuracy). We can observe some confusion between B, SB and SCB, and very little confusion between Y and PY. There is no correlation between the number of calls per call type and the by_type classification accuracy.

We can also display confusion matrices for our 9 configurations:

# Averaged confusion matrices for all 9 classifier x feature-set configurations.
outputs_type$averaged_confusion %>% get_confusion_matrix_9_configurations()

The patterns of correct and incorrect classifications seem to be the same in all 9 configurations. There does not seem to be a differential in terms of improvement for types with more or less calls. To further investigate, we can draw side by side the two confusion matrices for the best (xgboost - Bioacoustic) and worst (nn - MFCC) configurations:

# Confusion matrices of the best (xgboost + Bioacoustic) and worst (nn + MFCC)
# configurations; the two plot objects are reused later in the ensemble section.
# Use TRUE rather than the reassignable shortcut T.
p_xgboost_Bioacoustic <- outputs_type$averaged_confusion %>% get_confusion_matrix("xgboost", "Bioacoustic", display_raw_numbers = TRUE)
p_nn_MFCC <- outputs_type$averaged_confusion %>% get_confusion_matrix("nn", "MFCC", display_raw_numbers = TRUE)
grid.arrange(p_xgboost_Bioacoustic, p_nn_MFCC, ncol = 2)

We can see that P is especially impacted by choosing the weaker configuration - it is way more often confused with PY. The confusion between B, SB and SCB is also greater.

6.3 Investigating feature importance

For each set of predictors, we can look at the importance of the various features for different classifying algorithms.

We first look at the DCT set:

# Feature importance for the DCT predictors, one panel per classifying algorithm.
fimp_svm_dct <- outputs_type$averaged_fimp %>% get_fimp_barplot("svm", "DCT")
fimp_xgb_dct <- outputs_type$averaged_fimp %>% get_fimp_barplot("xgboost", "DCT")
fimp_nn_dct <- outputs_type$averaged_fimp %>% get_fimp_barplot("nn", "DCT")
grid.arrange(fimp_svm_dct, fimp_xgb_dct, fimp_nn_dct, ncol = 3)

The 3 algorithms strongly suggest that duration and dct2 play by far the most important roles. dct4 comes next, ahead of the other predictors.

We then look at the Bioacoustic set:

# Feature importance for the Bioacoustic predictors, one panel per classifying algorithm.
fimp_svm_bio <- outputs_type$averaged_fimp %>% get_fimp_barplot("svm", "Bioacoustic")
fimp_xgb_bio <- outputs_type$averaged_fimp %>% get_fimp_barplot("xgboost", "Bioacoustic")
fimp_nn_bio <- outputs_type$averaged_fimp %>% get_fimp_barplot("nn", "Bioacoustic")
grid.arrange(fimp_svm_bio, fimp_xgb_bio, fimp_nn_bio, ncol = 3)

We observe that duration is consistently and by far the most important feature for prediction of the type. f0_onset, f0_offset, f0.slope.asc and f0.slope.desc come next.

# Publication figure: xgboost feature importance for the Bioacoustic set
# side by side with the DCT set.
fimp_pub_bio <- outputs_type$averaged_fimp %>% get_fimp_barplot("xgboost", "Bioacoustic")
fimp_pub_dct <- outputs_type$averaged_fimp %>% get_fimp_barplot("xgboost", "DCT")
grid.arrange(fimp_pub_bio, fimp_pub_dct, ncol = 2)

6.4 Analyzing the performances of the ensemble models

6.4.1 Assessing the performances with different metrics

We can look at the performances of our different ensemble models, and compare them to those of our initial / primary classifiers. We first focus on logloss, since it is the metric which was used during the tuning of the hyper-parameters of the different classifiers:

# Compare ensembles / stacked learners to the primary classifiers on log loss.
outputs_type$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("log loss")

Several points can be highlighted:

  • ensembles / stacked learners tend to perform better than single classifiers
  • When it comes to combining the different primary classifiers for a given set of predictors, we observe that the performances of the stacked learners are better for MFCC and DCT, but not for Bioacoustic (xgboost with Bioacoustic is a bit better).
  • When we assemble the various versions of a classifier for different sets of predictors, we very strongly improve the performances. This improvement is stronger than what we observe when we assemble across the classifiers.
  • Finally, when we assemble all 9 primary learners (3 classifiers x 3 sets), we obtain the best performances.

We can compute and display the increase in performance with respect to the worst configuration (nn and MFCC):

# Improvement (in %) of each configuration's mean log loss relative to the
# worst configuration (the one with the largest mean log loss, i.e. nn + MFCC).
# Axis labels, legend title and plot title previously came from a copy-paste of
# another plot (the y axis shows percentages, not log loss; the x axis shows
# configurations, not feature sets; the fill is the classifier type).
outputs_type$averaged_performances %>%
  filter(measure_name == "log loss") %>%
  mutate(improvement_percentage = round(100 * (max(mean) - mean) / max(mean), 2)) %>%
  ggplot(aes(x = reorder(configuration, improvement_percentage), y = improvement_percentage, fill = classifier_type)) +
  geom_bar(stat = "identity", col = "black", position = position_dodge()) +
  geom_text(aes(label = round(improvement_percentage, 1)), hjust = 1.2, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
  labs(fill = "Classifier type") +
  theme_minimal() + coord_flip() + ylab("Improvement in log loss (%)") + xlab("Configuration") + ggtitle("Improvement in log loss relative to the worst configuration")

The best classifier is nearly 40% better than the worst one.

We can further look at other metrics:

# Same comparison with multi-class AUC.
outputs_type$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("AUC")

# Same comparison with balanced accuracy.
outputs_type$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("balanced accuracy")

AUC is very consistent with log loss. Balanced accuracy is also quite strongly consistent.

6.4.2 Confusion matrix of the best stacked model

We can inspect the confusion matrix of the best stacked model:

# Confusion matrix of the best stacked model (all 9 primary learners combined);
# kept in p_best for the side-by-side comparison below.
p_best <- outputs_type$averaged_confusion %>% get_confusion_matrix("ensemble", "3 classifiers x 3 sets", display_raw_numbers = F, title = "Confusion matrix - Ensemble 3 classifiers x 3 sets")
p_best

We can further display the previous matrix side by side with two other confusion matrices shown previously to better understand how the performances are improved:

# Best stacked model next to the best and worst single configurations.
grid.arrange(p_best, p_xgboost_Bioacoustic, p_nn_MFCC, ncol = 2, nrow = 2)

We can see that the performances with the best stacked model are improved for all the types but P compared to the best non-ensemble learner.

6.5 Comparing the performances to a random baseline

We first load the results of the 1,000 repetitions with shuffling / random permutation, and create different data tables…

# Load the results of the label-permutation (chance baseline) repetitions for the call-type task.
outputs_type_shuffled <- load_and_prepare_data("Result files/results - type - shuffled.RData", decreasing_order_nb_calls_by_type)

We draw the histogram of the values of a metric with the permutation approach, as well as a vertical line representing the mean value for the initial, non-permuted, approach.

For logloss:

# Mean log loss of the non-ensemble configurations (drawn as red reference lines).
perfs_mean_tmp <- outputs_type$averaged_performances %>%
  filter(algo != "ensemble", measure_name == "log loss") %>%
  select(algo, set, mean)

# Per-repetition log loss values of the non-ensemble configurations (pink histograms).
perfs_tmp <- outputs_type$performances %>% filter(algo != "ensemble", measure_name == "log loss")

# Grey histograms: null distribution of log loss under random permutation of labels;
# pink histograms + red segment: observed values and their mean on the real data.
outputs_type_shuffled$performances %>% 
  filter(measure_name == "log loss") %>%
  ggplot(aes(x = measure)) + 
  geom_histogram(bins = 80) +
  facet_grid(algo ~ set, scale = "free") +
  xlab("log loss") +
  labs(title = "Histograms of the distribution of log loss values", subtitle = "Different classifiers and feature sets") +
  geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
  geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)

The histograms do not overlap at all, and are very distant from each other (and the red line representing the mean of the performances without shuffling is very clearly left of the ‘shuffled’ histogram). This means that no log loss value obtained with a random permutation is ever lower than the value obtained with our data. We can therefore reject the null hypothesis of our prediction not being different from chance prediction with a p_value lower than 0.001.

For AUC:

# Same null-distribution comparison, for multi-class AUC.
perfs_mean_tmp <- outputs_type$averaged_performances %>%
  filter(algo != "ensemble", measure_name == "AUC") %>%
  select(algo, set, mean)

perfs_tmp <- outputs_type$performances %>% filter(algo != "ensemble", measure_name == "AUC")

outputs_type_shuffled$performances %>%
  filter(measure_name == "AUC") %>%
  ggplot(aes(x = measure)) + 
  geom_histogram(bins = 80) +
  facet_grid(algo ~ set, scale = "free") +
  xlab("AUC") +
  labs(title = "Histograms of the distribution of AUC values", subtitle = "Different classifiers and feature sets") +
  geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
  geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)

We reach the same conclusions with the balanced accuracy metric:

# Same null-distribution comparison, for balanced accuracy.
perfs_mean_tmp <- outputs_type$averaged_performances %>%
  filter(algo != "ensemble", measure_name == "balanced accuracy") %>%
  select(algo, set, mean)

perfs_tmp <- outputs_type$performances %>% filter(algo != "ensemble", measure_name == "balanced accuracy")

outputs_type_shuffled$performances %>%
  filter(measure_name == "balanced accuracy") %>%
  ggplot(aes(x = measure)) + 
  geom_histogram(bins = 80) +
  facet_grid(algo ~ set, scale = "free") +
  # BUG FIX: the x-axis label read "AUC" (copy-paste from the AUC plot).
  xlab("balanced accuracy") +
  labs(title = "Histograms of the distribution of balanced accuracy values", subtitle = "Different classifiers and feature sets") +
  geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
  geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)

The situation is symmetrical to the previous one, which makes sense since better performances mean lower logloss but higher AUC. No AUC value obtained with a random permutation is ever higher than the values obtained with our data. We can therefore reject the null hypothesis of our prediction not being different from chance prediction with a p_value lower than 0.001. We reach the same conclusions with the balanced accuracy metric.

We can therefore very confidently conclude that our primary classifiers, svm, nn and xgboost all predict well above chance.

6.6 Displaying and saving a table of results for the publication

In order to prepare a table of results for the publication, we need to import earlier data obtained with the pDFA…

# Row ordering and labelling of the final results table: 2 pDFA rows, then the
# 9 single-classifier configurations (numbered 1 to 9), then 7 ensembles.
algo_levels <- c("pDFA", "svm", "nn", "xgboost", "ensemble")
set_levels <- c("Bioacoustic", "DCT", "MFCC",
                "3 sets, svm", "3 sets, nn", "3 sets, xgboost",
                "3 classifiers, Bioacoustic set", "3 classifiers, DCT set", "3 classifiers, MFCC set", "3 classifiers x 3 sets")
# Configuration numbers label only the 9 single-classifier rows; the
# combination labels apply only to the 7 ensemble rows.
configs <- c(rep("", 2), sprintf("(%d)", seq_len(9)), rep("", 7))
combined_configs <- c(rep("", 11), "(1+2+3)", "(4+5+6)", "(7+8+9)", "(1+4+7)", "(2+5+8)", "(3+6+9)", "(1+2+3+4+5+6+7+8+9)")

res_pDFA <- readRDS("Result files/pdfa-permuted-resultats01.rds")

# pDFA performance metrics for the call-type task, per feature set.
# Accuracy is reconstructed as (correct test classifications) / (test-set size).
pdFA_Bio_logloss <- res_pDFA$type$Bioacoustic %>% pull(Log.Loss)
pdFA_Bio_auc <- res_pDFA$type$Bioacoustic %>% pull(AUC)
pdFA_Bio_bac <- res_pDFA$type$Bioacoustic %>% pull(Bal.Acc)
pdFA_Bio_acc <- res_pDFA$type$Bioacoustic %>% pull(N.Correct.Test) / res_pDFA$type$Bioacoustic %>% pull(N.Test)
pdFA_DCT_logloss <- res_pDFA$type$DCT %>% pull(Log.Loss)
pdFA_DCT_auc <- res_pDFA$type$DCT %>% pull(AUC)
pdFA_DCT_bac <- res_pDFA$type$DCT %>% pull(Bal.Acc)
# BUG FIX: the denominator previously used the Bioacoustic test-set size
# (copy-paste error); the DCT accuracy must be normalised by the DCT test-set size.
pdFA_DCT_acc <- res_pDFA$type$DCT %>% pull(N.Correct.Test) / res_pDFA$type$DCT %>% pull(N.Test)

We create and display a comprehensive table of results…

# Build the comprehensive results table for the publication:
#  - pivot to wide format (one column per performance metric),
#  - append two rows for the earlier pDFA results,
#  - order rows by algorithm then feature set, attach configuration labels,
#  - round the numeric metrics and rename the key columns for display.
output_df <- outputs_type$averaged_performances %>%
  select(-sd, -classifier_type, -configuration) %>%
  pivot_wider(names_from = measure_name, values_from = mean) %>%
  select(-Kappa, -`mean misclassification error`) %>%
  add_row(algo = "pDFA", set = "Bioacoustic", accuracy = pdFA_Bio_acc, AUC = pdFA_Bio_auc, `balanced accuracy` = pdFA_Bio_bac, `log loss` = pdFA_Bio_logloss) %>%
  add_row(algo = "pDFA", set = "DCT", accuracy = pdFA_DCT_acc, AUC = pdFA_DCT_auc, `balanced accuracy` = pdFA_DCT_bac, `log loss` = pdFA_DCT_logloss) %>%
  mutate(algo = factor(algo, algo_levels), set = factor(set, set_levels)) %>%
  arrange(algo, set) %>%
  mutate(`Config. #`= configs, `Combined config.` = combined_configs) %>%
  select(algo, set, `Config. #`,  `Combined config.`, `log loss`, AUC, accuracy, `balanced accuracy`) %>%
  mutate_if(is.numeric, round, digits = 3) %>%
  rename(Algorithm = "algo", `Feature set` = "set")

# Render the results table. The table has 8 columns, so the align string must
# contain 8 codes (it previously had 9); also use TRUE rather than T.
output_df %>% kable(align = 'llccrrrr') %>% kable_classic() %>% kable_styling(full_width = TRUE, bootstrap_options = c("striped"))
Algorithm Feature set Config. # Combined config. log loss AUC accuracy balanced accuracy
pDFA Bioacoustic 0.819 0.894 0.651 0.588
pDFA DCT 0.813 0.901 0.674 0.596
svm Bioacoustic
0.671 0.931 0.736 0.718
svm DCT
0.658 0.934 0.745 0.732
svm MFCC
0.810 0.906 0.664 0.672
nn Bioacoustic
0.674 0.933 0.728 0.702
nn DCT
0.681 0.932 0.724 0.701
nn MFCC
0.890 0.888 0.621 0.625
xgboost Bioacoustic
0.627 0.940 0.755 0.747
xgboost DCT
0.651 0.936 0.732 0.720
xgboost MFCC
0.830 0.901 0.654 0.659
ensemble 3 sets, svm (1+2+3) 0.581 0.952 0.785 0.784
ensemble 3 sets, nn (4+5+6) 0.604 0.950 0.773 0.767
ensemble 3 sets, xgboost (7+8+9) 0.578 0.952 0.785 0.784
ensemble 3 classifiers, Bioacoustic set (1+4+7) 0.640 0.939 0.759 0.744
ensemble 3 classifiers, DCT set (2+5+8) 0.648 0.938 0.752 0.736
ensemble 3 classifiers, MFCC set (3+6+9) 0.792 0.912 0.675 0.683
ensemble 3 classifiers x 3 sets (1+2+3+4+5+6+7+8+9) 0.542 0.957 0.794 0.794

We further save the table of results as a text file…

# Export the results table as a tab-separated text file
# (TRUE/FALSE instead of the reassignable shortcuts T/F).
write.table(output_df, file = "Result files/performances - type.txt", sep = "\t", dec = ".", row.names = FALSE, quote = FALSE)

6.7 Figure for the paper

Another representation for balanced accuracy with added rows for the pDFA:

# Import the pDFA results and extract the mean and standard deviation of the
# balanced accuracy for the two feature sets (call-type classification task).
res_pDFA <- readRDS("Result files/pdfa-permuted-resultats01.rds")
pdFA_Bio_bac_m <- res_pDFA$type$Bioacoustic[["Bal.Acc"]]
pdFA_Bio_bac_sd <- res_pDFA$type$Bioacoustic[["sd.Bal.Acc"]]
pdFA_DCT_bac_m <- res_pDFA$type$DCT[["Bal.Acc"]]
pdFA_DCT_bac_sd <- res_pDFA$type$DCT[["sd.Bal.Acc"]]

# Publication figure (call-type task): append the two pDFA configurations as
# extra rows, keep the single classifiers plus the full 3x3 ensemble, and draw
# the balanced-accuracy comparison barplot.
outputs_type$averaged_performances %>%
  add_row(algo = "pDFA", set = "Bioacoustic", measure_name = "balanced accuracy", configuration = "pDFA - Bioacoustic", classifier_type = "single classifier", mean = pdFA_Bio_bac_m, sd = pdFA_Bio_bac_sd) %>%
  add_row(algo = "pDFA", set = "DCT", measure_name = "balanced accuracy", configuration = "pDFA - DCT", classifier_type = "single classifier", mean = pdFA_DCT_bac_m, sd = pdFA_DCT_bac_sd) %>%
  filter(algo != "ensemble" | configuration == "ensemble - 3 classifiers x 3 sets") %>%
  get_barplots_comparison_all_configurations_by_set("balanced accuracy")

7 Dealing with data leakage: impact of the organization of calls in sequences

We focus on a specific case of possible data leakage, which relates to the non-independence of calls due to their organization in sequences. We want to compare classification performances when i) we do not do anything regarding how sequences overlap the training and test sets (default strategy), ii) we minimize how sequences overlap the training and test sets (fair strategy) (i.e., the calls of a sequence tend to be either all in the training set or all in the test set), and iii) we maximize how sequences overlap the training and test sets (skewed strategy) (i.e., the calls of a sequence are distributed as equally as possible between the training set and the test set).

7.1 Comparing performances between different optimization of sequences

7.1.1 Loading and preparing data

We first load the results of the assessments of the different configurations, and create different data tables…

# Load the individual-classification results for the three sequence-handling
# strategies: default (no constraint), skewed (maximize sequence overlap
# between training and test sets) and fair (minimize that overlap).
outputs_individual_default <- load_and_prepare_data("Result files/results - individual - default.RData", decreasing_order_nb_calls_by_individual)
outputs_individual_skewed <- load_and_prepare_data("Result files/results - individual - skewed.RData", decreasing_order_nb_calls_by_individual)
outputs_individual_fair <- load_and_prepare_data("Result files/results - individual - fair.RData", decreasing_order_nb_calls_by_individual)

We define a function combine_tables(), which combines different tables of data corresponding to different strategies (default, fair, skewed) into a single table…

# Combine the result tables of the three sequence-handling strategies into a
# single long table, tagging each row with its strategy in `sequence_optim`.
#
# @param dt_default Table of results for the default strategy.
# @param dt_skewed  Table of results for the skewed strategy.
# @param dt_fair    Table of results for the fair strategy.
# @return A single table stacking the three inputs, with an added
#   `sequence_optim` column ("Default", "Skewed" or "Fair").
combine_tables <- function(dt_default, dt_skewed, dt_fair) {
  # Tag each table with its strategy label, then stack the three tables.
  tagged <- Map(
    function(dt, strategy) mutate(dt, sequence_optim = strategy),
    list(dt_default, dt_skewed, dt_fair),
    c("Default", "Skewed", "Fair")
  )
  do.call(rbind, tagged)
}

We now call the previous function for the two tables corresponding to classification performances and feature importance…

# Stack the per-repetition performances of the three strategies into one table,
# discarding the ensemble rows (only primary classifiers are compared here).
performances_individual_all <-
  combine_tables(outputs_individual_default$performances, outputs_individual_skewed$performances, outputs_individual_fair$performances) %>%
  filter(algo != "ensemble") 

# Same stacking for the averaged feature-importance tables.
averaged_fimp_individual_all <-
  combine_tables(outputs_individual_default$averaged_fimp, outputs_individual_skewed$averaged_fimp, outputs_individual_fair$averaged_fimp) %>%
  filter(algo != "ensemble")

We finally need to combine the data regarding the quality/score of the three strategies - how much overlap there is…

# Collect the per-run sequence-overlap scores (distance to the optimum) of the
# three strategies into a single long table.
sequence_scores_default <- tibble(sequence_score = outputs_individual_default$sequence_scores, sequence_optim = "Default")
sequence_scores_skewed <- tibble(sequence_score = outputs_individual_skewed$sequence_scores, sequence_optim = "Skewed")
sequence_scores_fair <- tibble(sequence_score = outputs_individual_fair$sequence_scores, sequence_optim = "Fair")

sequence_scores <- rbind(sequence_scores_default, sequence_scores_skewed, sequence_scores_fair)

7.1.2 Distance to the optimum

We can display, for each strategy, how far the pairs of sets are from an optimum in terms of sequence distribution, i.e., a case where no sequence is distributed across the two sets:

# Histogram of the distance to the optimal sequence distribution (0 = no
# sequence split across training and test sets), per strategy; the plot object
# is kept for reuse in the publication figure.
hist_distance_to_optimum <- sequence_scores %>% ggplot(aes(x = sequence_score, fill = sequence_optim)) + 
  geom_histogram(bins = 220) +
  scale_fill_manual(values=c("#f9cb9c", "#b6d7a8", "#980000")) +
  xlab("Distance to the optimum") + labs(fill = "Strategy") +
  xlim(0, 350) +
  labs(title = "Distance to the optimum in terms of sequence overlapping", subtitle = "Histogram for 100 runs")
hist_distance_to_optimum

We can clearly see that the fair strategy has the least sequence overlapping, and the skewed strategy has the most, as expected. The default strategy is closer to the latter in terms of distance to the optimum, which may suggest that doing nothing might not be such a good idea.

7.1.3 Kernel density estimates of performances

We wish to know whether sequences overlapping the training and test sets lead to higher classification performances because the classifiers take advantage of the resemblances between calls of the same sequence. Conversely, we wish to know whether restricting sequences to a single set - training or test - as much as possible leads to worse performances.

We display the distribution of performances as assessed with the log loss metric, highlighting the differences between the ‘fair’, ‘skewed’ and ‘default’ conditions:

# Kernel density of the per-repetition log loss values, coloured by sequence
# strategy, one panel per classifier x feature set.
# Fixes the typo "stategies" in the displayed subtitle.
performances_individual_all %>% 
  filter(measure_name == "log loss") %>%
  ggplot(aes(x = measure, fill = sequence_optim)) + 
  geom_density(alpha = 0.25) +
  facet_grid(algo ~ set) +
  xlab("log loss") +
  labs(title = "Kernel density estimates - log loss values", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Strategy")

We observe that the performances for ‘default’ and ‘fair’ are quite close, although the distribution for ‘fair’ seems to be slightly more on the right, i.e. correspond to higher values of log loss and therefore to worse performances. The distribution for ‘skewed’ clearly correspond to lower values of log loss, i.e. to better performances. This supports our aforementioned hypothesis, although weakly.

We can repeat the same approach with balanced accuracy and AUC as the measures of performance:

# Same density comparison for balanced accuracy; the plot object is kept for
# reuse. Fixes the typo "stategies" in the displayed subtitle.
p_perf_hist <- performances_individual_all %>% 
  filter(measure_name == "balanced accuracy") %>%
  ggplot(aes(x = measure, fill = sequence_optim)) + 
  geom_density(alpha = 0.25) +
  facet_grid(algo ~ set) +
  xlab("Balanced accuracy") +
  labs(title = "Kernel density estimates - balanced accuracy", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Strategy")

p_perf_hist

# Same density comparison for AUC.
# Fixes the typo "stategies" in the displayed subtitle.
performances_individual_all %>% 
  filter(measure_name == "AUC") %>%
  ggplot(aes(x = measure, fill = sequence_optim)) + 
  geom_density(alpha = 0.25) +
  facet_grid(algo ~ set) +
  xlab("AUC") +
  labs(title = "Kernel density estimates - AUC values", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Strategy")

What we observe leads to exactly the same conclusion as with log loss.

7.1.4 Barplots of mean performances

We compute the average performances for all the different configurations…

# Average (and SD of) each metric per strategy x classifier x feature set, and
# fix the display order of the strategies (Fair / Default / Skewed).
mean_performances_individual_all <- performances_individual_all %>%
  group_by(sequence_optim, algo, set, measure_name) %>%
  summarize(mean = mean(measure), sd = sd(measure), .groups = "drop") %>%
  mutate(sequence_optim = fct_relevel(sequence_optim, c("Fair", "Default", "Skewed")))

We display the average performances for the different sets of predictors and classifiers so that we can highlight the differences between our different strategies for the sequences of calls:

# Mean log loss (+/- SD) per strategy, faceted by classifier x feature set.
# Fixes the typo "stategies" in the displayed subtitle.
mean_performances_individual_all %>% filter(measure_name == "log loss") %>%
  ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
  geom_bar(stat = "identity", col = "black", position = position_dodge()) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
  geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
  theme_minimal() + coord_flip() + ylab("log loss") + xlab("Strategies") +
  labs(title = "Average performances - log loss", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Classifier") +
  facet_grid(algo ~ set)

We can see clearly that the performances drop with the ‘fair’ strategy, although the drop is not very large. The difference is much smaller between ‘default’ and ‘fair’.

We can confirm our observations with AUC and balanced accuracy:

# Same barplot for AUC.
# Fixes the typo "stategies" in the displayed subtitle.
mean_performances_individual_all %>% filter(measure_name == "AUC") %>%
  ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
  geom_bar(stat = "identity", col = "black", position = position_dodge()) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
  geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
  theme_minimal() + coord_flip() + ylab("AUC") + xlab("Strategies") +
  labs(title = "Average performances - AUC", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Classifying algorithm") +
  facet_grid(algo ~ set)

# Same barplot for balanced accuracy; the plot object is kept for the
# publication figure. Fixes the typo "stategies" in the displayed subtitle.
p_perf_mean <- mean_performances_individual_all %>% filter(measure_name == "balanced accuracy") %>%
  ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
  geom_bar(stat = "identity", col = "black", position = position_dodge()) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
  geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
  theme_minimal() + coord_flip() + ylab("Balanced accuracy") + xlab("Strategies") +
  labs(title = "Average performances - balanced accuracy", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Classifier") +
  facet_grid(algo ~ set)

plot(p_perf_mean)

We get exactly the same results.

We can conclude that paying attention to the sequences does not deeply modify the results we get when it comes to performances. We may wonder, however, whether the importance of the different predictors is impacted more significantly.

We create a figure for the publication…

# Publication figure combining the overlap histogram and the mean-performance barplot.
p_both <- grid.arrange(hist_distance_to_optimum, p_perf_mean, ncol = 2)

7.1.5 Feature importance

We further compute the average importance of the predictors for all the different configurations…

# Fix the display order of the strategies (Fair / Default / Skewed) for the importance plots.
averaged_fimp_individual_all <- averaged_fimp_individual_all %>% mutate(sequence_optim = fct_relevel(sequence_optim, c("Fair", "Default", "Skewed")))

We display the average importance for the different classifiers and the different predictors of the DCT set, so that we can highlight the differences between our different strategies for the sequences of calls:

# Mean importance (+/- SD) of each DCT predictor per strategy, faceted by
# feature x classifier, to check whether sequence handling changes importance.
averaged_fimp_individual_all %>%
  filter(set == "DCT") %>% droplevels() %>%
  ggplot(aes(x = sequence_optim, y = mean)) +
  geom_bar(stat = "identity", col = "black") +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
  geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3) +
  xlab("Feature") + ylab("Importance") +
  theme_minimal() + coord_flip() +
  ggtitle("Importance of DCT coefficients") +
  theme(plot.title = element_text(size = 10, face = "bold")) +
  facet_grid(feature ~ algo)

We do not observe drastic changes between the three configurations.

We can check in a similar way the predictors of the Bioacoustic set:

# Same importance comparison for the Bioacoustic predictors.
averaged_fimp_individual_all %>%
  filter(set == "Bioacoustic") %>% droplevels() %>%
  ggplot(aes(x = sequence_optim, y = mean)) +
  geom_bar(stat = "identity", col = "black") +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
  geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3) +
  xlab("Feature") + ylab("Importance") +
  theme_minimal() + coord_flip() +
  ggtitle("Importance of Bioacoustic coefficients") +
  theme(plot.title = element_text(size = 10, face = "bold")) +
  facet_grid(feature ~ algo)

Once again, the variation we observe between the three strategies is quite limited.

7.2 Long sequences only

One may wonder if the previous results, with a limited impact of the management strategy of the sequences of calls, could provide a biased perspective on the issue of data leakage because of the large number of sequences with only 1 or 2 calls in our dataset. During the curation phase, we thus created a second dataset which contains only calls appearing in sequences of at least 3 calls, and we can repeat the same procedure as above. We may thus offer a complementary view on the question of interest.

7.2.1 Loading and preparing data

We first load the results of the assessments of the different configurations, and create different data tables…

# Load the same three-strategy results, restricted to the long-sequences
# dataset (sequences of at least 3 calls); the previous outputs_* objects are
# deliberately overwritten.
outputs_individual_default <- load_and_prepare_data("Result files/results - individual - long sequences - default.RData", decreasing_order_nb_calls_by_individual)
outputs_individual_skewed <- load_and_prepare_data("Result files/results - individual - long sequences - skewed.RData", decreasing_order_nb_calls_by_individual)
outputs_individual_fair <- load_and_prepare_data("Result files/results - individual - long sequences - fair.RData", decreasing_order_nb_calls_by_individual)

We now call the function combine_tables() for the two tables corresponding to classification performances and feature importance…

# Stack the long-sequences performances and feature importances of the three
# strategies, excluding ensembles (these overwrite the full-dataset tables).
performances_individual_all <-
  combine_tables(outputs_individual_default$performances, outputs_individual_skewed$performances, outputs_individual_fair$performances) %>%
  filter(algo != "ensemble") 

averaged_fimp_individual_all <-
  combine_tables(outputs_individual_default$averaged_fimp, outputs_individual_skewed$averaged_fimp, outputs_individual_fair$averaged_fimp) %>%
  filter(algo != "ensemble")

We combine the data regarding the quality/score of the three strategies…

# Collect the per-run sequence-overlap scores for the long-sequences dataset.
# CONSISTENCY FIX: use the same capitalised strategy labels as in section 7.1
# ("Default"/"Skewed"/"Fair") so legends, fill colours and any later
# fct_relevel() calls behave identically across the two analyses
# (they were lowercase here).
sequence_scores_default <- tibble(sequence_score = outputs_individual_default$sequence_scores, sequence_optim = "Default")
sequence_scores_skewed <- tibble(sequence_score = outputs_individual_skewed$sequence_scores, sequence_optim = "Skewed")
sequence_scores_fair <- tibble(sequence_score = outputs_individual_fair$sequence_scores, sequence_optim = "Fair")

sequence_scores <- rbind(sequence_scores_default, sequence_scores_skewed, sequence_scores_fair)

7.2.2 Distance to the optimum

We can display for the three strategies how far the pairs of sets are from an optimum in terms of sequence distribution:

# Histogram of the distance-to-optimum scores, one fill colour per strategy.
hist_distance_to_optimum <- ggplot(sequence_scores,
                                   aes(x = sequence_score, fill = sequence_optim)) +
  geom_histogram(bins = 220) +
  scale_fill_manual(values = c("#f9cb9c", "#b6d7a8", "#980000")) +
  xlim(0, 350) +
  labs(x = "Distance to the optimum",
       fill = "Strategy",
       title = "Distance to the optimum in terms of sequence overlapping (long sequences)",
       subtitle = "Histogram for 100 runs")
print(hist_distance_to_optimum)

In comparison to the previous case where even very short sequences were considered, we observe that the default strategy is much closer to the skewed strategy. Optimizing the distribution of sequences across the training and test sets leads to significantly different distributions of the calls in these sets.

7.2.3 Kernel density estimates of performances

We display the distribution of performances as assessed with the log loss metric, highlighting the differences between the three strategies:

# Kernel density estimates of the log loss, faceted by classifier (rows)
# and feature set (columns), with one density curve per strategy.
performances_individual_all %>% 
  filter(measure_name == "log loss") %>%
  ggplot(aes(x = measure, fill = sequence_optim)) + 
  geom_density(alpha = 0.25) +
  facet_grid(algo ~ set) +
  xlab("log loss") +
  labs(fill = "Strategy") +
  # Fixed: "strategies" was misspelled "stategies" in the displayed subtitle.
  labs(title = "Kernel density estimates - log loss (long sequences)", subtitle = "Different classifiers, feature sets and overlapping strategies")

We observe a different pattern from the previous one: we see that the default and skewed strategies are very close, while the fair strategy leads to weaker performances.

We can repeat the same approach with balanced accuracy and AUC as the measures of performance:

# Same density plot for balanced accuracy; stored in p_perf_hist and then
# displayed explicitly with plot().
p_perf_hist <- performances_individual_all %>% 
  filter(measure_name == "balanced accuracy") %>%
  ggplot(aes(x = measure, fill = sequence_optim)) + 
  geom_density(alpha = 0.25) +
  facet_grid(algo ~ set) +
  xlab("Balanced accuracy") +
  labs(fill = "Strategy") +
  # Fixed: "strategies" was misspelled "stategies" in the displayed subtitle.
  labs(title = "Kernel density estimates - balanced accuracy (long sequences)", subtitle = "Different classifiers, feature sets and overlapping strategies")

plot(p_perf_hist)

# Same density plot for AUC.
performances_individual_all %>% 
  filter(measure_name == "AUC") %>%
  ggplot(aes(x = measure, fill = sequence_optim)) + 
  geom_density(alpha = 0.25) +
  facet_grid(algo ~ set) +
  xlab("AUC") +
  labs(fill = "Strategy") +
  # Fixed: "strategies" was misspelled "stategies" in the displayed subtitle.
  labs(title = "Kernel density estimates - AUC (long sequences)", subtitle = "Different classifiers, feature sets and overlapping strategies")

The patterns are similar to those we described with log loss.

7.2.4 Barplots of mean performances

We compute the average performances for all the different configurations…

# Mean and standard deviation of each performance measure for every
# strategy x classifier x feature-set combination.
mean_performances_individual_all <- performances_individual_all %>%
  group_by(sequence_optim, algo, set, measure_name) %>%
  summarize(mean = mean(measure), sd = sd(measure), .groups = "drop") %>%
  # NOTE(review): the levels here are capitalized ("Fair", "Default", "Skewed")
  # whereas the strategy labels built earlier in this document are lowercase
  # ("fair", "default", "skewed"). If sequence_optim's values are lowercase
  # here too, fct_relevel() silently leaves the order unchanged (its "unknown
  # levels" warning is hidden by warning = FALSE). Confirm the case produced
  # by combine_tables().
  mutate(sequence_optim = fct_relevel(sequence_optim, c("Fair", "Default", "Skewed")))

We display the average performances for the different sets of predictors and classifiers so that we can highlight the differences between our different strategies for the sequences of calls:

# Mean log loss per strategy with +/- 1 SD error bars, faceted by classifier
# (rows) and feature set (columns).
mean_performances_individual_all %>% filter(measure_name == "log loss") %>%
  ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
  geom_bar(stat = "identity", col = "black", position = position_dodge()) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
  geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
  theme_minimal() + coord_flip() + ylab("log loss") + xlab("Strategies") +
  # Fixed: "strategies" was misspelled "stategies" in the displayed subtitle.
  labs(title = "Average performances - log loss (long sequences)", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Classifier") +
  facet_grid(algo ~ set)

We can see clearly that the performances drop with the ‘fair’ strategy, with a greater drop than when all calls were considered.

We can confirm our observations with AUC and balanced accuracy:

# Same barplot for AUC.
mean_performances_individual_all %>% filter(measure_name == "AUC") %>%
  ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
  geom_bar(stat = "identity", col = "black", position = position_dodge()) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
  geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
  theme_minimal() + coord_flip() + ylab("AUC") + xlab("Strategies") +
  # Fixed: "stategies" typo in the subtitle, and the fill legend label is now
  # "Classifier" for consistency with the other performance barplots.
  labs(title = "Average performances - AUC (long sequences)", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Classifier") +
  facet_grid(algo ~ set)

# Same barplot for balanced accuracy; stored in p_perf_mean because it is
# reused below in the publication figure, then displayed with plot().
p_perf_mean <- mean_performances_individual_all %>% filter(measure_name == "balanced accuracy") %>%
  ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
  geom_bar(stat = "identity", col = "black", position = position_dodge()) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
  geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
  theme_minimal() + coord_flip() + ylab("Balanced accuracy") + xlab("Strategies") +
  # Fixed: "strategies" was misspelled "stategies" in the displayed subtitle.
  labs(title = "Average performances - balanced accuracy (long sequences)", subtitle = "Different classifiers, feature sets and overlapping strategies", fill = "Classifier") +
  facet_grid(algo ~ set)

plot(p_perf_mean)

In conclusion, if we focus on longer sequences, there is a greater impact of not accounting for the data leakage which results from sequences of calls appearing both in the training and test set. This should be a word of caution when dealing with other patterns of non-independence of the observations.

We create a figure for the publication…

p_both <- grid.arrange(hist_distance_to_optimum, p_perf_mean, ncol = 2)

7.2.5 Feature importance

We further compute the average importance of the predictors for all the different configurations…

averaged_fimp_individual_all <- averaged_fimp_individual_all %>% mutate(sequence_optim = fct_relevel(sequence_optim, c("Fair", "Default", "Skewed")))

We display the average importance for the different classifiers and the different predictors of the DCT set, so that we can highlight the differences between our different strategies for the sequences of calls:

# Mean importance (+/- 1 SD) of each DCT predictor, faceted by feature (rows)
# and classifier (columns), one bar per strategy.
averaged_fimp_individual_all %>%
  filter(set == "DCT") %>%
  droplevels() %>%
  ggplot(aes(x = sequence_optim, y = mean)) +
  geom_bar(stat = "identity", col = "black") +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
  geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3) +
  theme_minimal() +
  coord_flip() +
  labs(x = "Feature",
       y = "Importance",
       title = "Importance of DCT coefficients (long sequences only)") +
  theme(plot.title = element_text(size = 10, face = "bold")) +
  facet_grid(feature ~ algo)

We do not observe drastic changes between the three configurations.

We can check in a similar way the predictors of the Bioacoustic set:

# Same importance plot for the predictors of the Bioacoustic feature set.
averaged_fimp_individual_all %>%
  filter(set == "Bioacoustic") %>%
  droplevels() %>%
  ggplot(aes(x = sequence_optim, y = mean)) +
  geom_bar(stat = "identity", col = "black") +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
  geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3) +
  theme_minimal() +
  coord_flip() +
  labs(x = "Feature",
       y = "Importance",
       title = "Importance of Bioacoustic coefficients (long sequences only)") +
  theme(plot.title = element_text(size = 10, face = "bold")) +
  facet_grid(feature ~ algo)

Once again, the variation we observe between the three strategies is quite limited.

8 Environment

cat(utils::sessionInfo()$R.version$version.string, "<br />", sep = "")

R version 4.1.3 (2022-03-10)

cat("Platform: ", utils::sessionInfo()$platform, "<br />", sep = "")

Platform: x86_64-w64-mingw32/x64 (64-bit)

cat("OS version: ", utils::sessionInfo()$running, "<br />", sep = "")

OS version: Windows 10 x64 (build 22000)

9 Packages

# Collect the names of all attached packages (base + others) for this session.
packages.base <- utils::sessionInfo()$basePkgs
packages.others <- names(utils::sessionInfo()$otherPkgs)

packages <- c(packages.base, packages.others)

# Print one Markdown reference line per package: `name`: *Title* version x.y.z.
# Iterating over the elements directly (instead of 1:length(packages)) is safe
# even if the vector were empty. The original code assigned cat()'s invisible
# NULL return value to `reference` and then evaluated `reference` as a no-op
# inside the loop; cat() is called purely for its side effect, so both the
# assignment and the extra line are dropped. packageDescription() is now
# queried once per package rather than twice.
for (pkg in packages) {
  description <- packageDescription(pkg)
  cat("`", pkg, "`: ",
      "*", description$Title, "* version ",
      description$Version,
      "<br />",
      sep = "")
}

stats: The R Stats Package version 4.1.3
graphics: The R Graphics Package version 4.1.3
grDevices: The R Graphics Devices and Support for Colours and Fonts version 4.1.3
utils: The R Utils Package version 4.1.3
datasets: The R Datasets Package version 4.1.3
methods: Formal Methods and Classes version 4.1.3
base: The R Base Package version 4.1.3
kableExtra: Construct Complex Table with ‘kable’ and Pipe Syntax version 1.3.4
rlist: A Toolbox for Non-Tabular Data Manipulation version 0.4.6.2
mlr: Machine Learning in R version 2.19.0
ParamHelpers: Helpers for Parameters in Black-Box Optimization, Tuning and Machine Learning version 1.14
gridExtra: Miscellaneous Functions for “Grid” Graphics version 2.3
caret: Classification and Regression Training version 6.0-90
lattice: Trellis Graphics for R version 0.20-45
reshape2: Flexibly Reshape Data: A Reboot of the Reshape Package version 1.4.4
forcats: Tools for Working with Categorical Variables (Factors) version 0.5.1
stringr: Simple, Consistent Wrappers for Common String Operations version 1.4.0
dplyr: A Grammar of Data Manipulation version 1.0.8
purrr: Functional Programming Tools version 0.3.4
readr: Read Rectangular Text Data version 2.1.2
tidyr: Tidy Messy Data version 1.2.0
tibble: Simple Data Frames version 3.1.6
ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics version 3.3.5
tidyverse: Easily Install and Load the ‘Tidyverse’ version 1.3.1
knitr: A General-Purpose Package for Dynamic Report Generation in R version 1.37


  1. Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎

  2. Université de Lyon, CNRS, Laboratoire Dynamique Du Langage, UMR 5596, Lyon, France↩︎

  3. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  4. Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎

  5. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  6. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  7. Department of Linguistics, The University of Hong Kong, Hong Kong, China, Corresponding author↩︎